# Sentencing and Recidivism 
libraries <- c("tidyr", "tidyverse", "ggplot2", "boot", "caret", "gridExtra", "dplyr", "modelsummary")
for (library_name in libraries) {
  library(library_name, character.only = TRUE)
}
setwd("~/Documents/Journal articles/Identification")  
rm(list = ls())
# Load North Carolina Recidivism Dataset
reader = file("northcarolina1.txt", open = "r")
input = readLines(reader)
close(reader)
data = matrix(NA, nrow = 0, ncol = 28)
for(line in input){
  line = strtoi(strsplit(line, split = "")[[1]])
  if(line[28] != 3){
    data = rbind(data, line[1:28])
  }
}
reader = file("northcarolina2.txt", open = "r")
input = readLines(reader)
close(reader)
for(line in input){
  line = strtoi(strsplit(line, split = "")[[1]])
  if(line[28] != 3){
    data = rbind(data, line[1:28])
  }
}
data[is.na(data)] = 0
race = data[,1] # non-black = 1 - mean 51%
alcohol = data[,2] # alcohol problem = 1 - mean 29%
drugs = data[,3] # hard drug user = 1 - mean 23%
parole = data[,4] # parole = 1 - mean 76%
married = data[,5] # married = 1 - mean 25%
felony = data[,6] # felony = 1 - mean 37%
workrelease = data[,7] # work release = 1 - mean 44%
property = data[,8] # property crime = 1 - mean 36%
personal = data[,9] # personal crime = 1 - mean 9%
gender = data[,10] # male = 1 - mean 94%
priors = 10*data[,11] + data[,12] # mean 1.36
school = 10*data[,13] + data[,14] # mean 9.65
violations = 10*data[,15] + data[,16] # mean 1.36
age = (100*data[,17] + 10*data[,18] + data[,19])/12 # mean 28.52
prisontime = (100*data[,20] + 10*data[,21] + data[,22])/12 # mean 1.614
followup = (10*data[,23] + data[,24])/12 # mean 5.16
recid = data[,25] # mean 37%
returntime = (10*data[,26] + data[,27])/12 # mean 1.77, only for recid = 1
data.cov = data.frame(cbind(race, age, gender, alcohol, drugs, school, married, priors, felony, property, personal))
data.all = data.frame(cbind(recid, prisontime, data.cov))
data.all <- data.all #%>% mutate(id = row_number())
write.csv(data.all, "nc_recid.csv", row.names = TRUE)

# Descriptive Statistics
tmp_pre <- data.all[, c("recid", "prisontime", "race", "age", "gender", "alcohol", "drugs", "school", "married", "priors", "felony", "property", "personal")] %>% 
  select(`Recidivism` = recid, `Prison time` = prisontime, `Race` = race, `Age` = age, `Gender` = gender, `Alcohol` = alcohol, `Hard drug use` = drugs, `Schooling` = school, 
         `Marital status` = married, `Prior criminal record` = priors,`Felony crime` = felony, `Property crime` = property, `Personal crime` = personal)
datasummary(`Recidivism` + `Prison time` +`Race` +`Age` + `Gender` + `Alcohol` +`Hard drug use` + `Schooling` + `Marital status` + `Prior criminal record` + `Felony crime` + `Property crime` +
              `Personal crime` ~  Mean + SD + Min + Max, data=tmp_pre)

# Causal Fusion Logistic Regression (backdoor adjustment)
rm(list = ls())
logit_obs <- read_csv("/Users/firatbilgel/Documents/Journal articles/identification/nc_resid_logistic_BFPX_1/nc_recid (observational).csv", col_names = FALSE)
logit_exp <- read_csv("/Users/firatbilgel/Documents/Journal articles/identification/nc_resid_logistic_BFPX_1/nc_recid (experimental).csv", col_names = FALSE)
# Rename colum names as v1, v2,...v50
colnames(logit_obs) <- paste0("v", 1:ncol(logit_obs))
colnames(logit_exp) <- paste0("v", 1:ncol(logit_exp))
# Generate threshold values for the treatment from the first column
threshold_df <- logit_obs[1, ]
threshold_df <- threshold_df %>% pivot_longer(cols = everything(), names_to = c("variable", "stat"), names_pattern = "(v[0-9]+)_(.*)")
threshold_df <- threshold_df[, -c(1:2) ]
threshold_df <- threshold_df %>% rename("T" = "value")
# Drop the first row (threshold values)
logit_obs <- logit_obs[-1, ]
logit_exp <- logit_exp[-1, ]

for (i in 1:50) {
  logit_obs <- logit_obs %>% mutate(!!sym(paste0("v", i, "_lower")) := quantile(!!sym(paste0("v", i)), 0.025),
                                !!sym(paste0("v", i, "_upper")) := quantile(!!sym(paste0("v", i)), 0.975), !!sym(paste0("v", i, "_mean")) := mean(!!sym(paste0("v", i))))
  logit_exp <- logit_exp %>% mutate(!!sym(paste0("v", i, "_lower")) := quantile(!!sym(paste0("v", i)), 0.025),
                                !!sym(paste0("v", i, "_upper")) := quantile(!!sym(paste0("v", i)), 0.975), !!sym(paste0("v", i, "_mean")) := mean(!!sym(paste0("v", i))))
  
}
logit_obs <- logit_obs[, -c(1:50)]
logit_exp <- logit_exp[, -c(1:50)]
logit_obs <- logit_obs[1, ]
logit_exp <- logit_exp[1, ]
for (i in 1:50) {
  logit_obs <- logit_obs %>% mutate(!!sym(paste0("v", i, "_lcl")) := 2 * !!sym(paste0("v", i, "_mean")) - !!sym(paste0("v", i, "_upper")),
                                !!sym(paste0("v", i, "_ucl")) := 2 * !!sym(paste0("v", i, "_mean")) - !!sym(paste0("v", i, "_lower")))
  logit_exp <- logit_exp %>% mutate(!!sym(paste0("v", i, "_lcl")) := 2 * !!sym(paste0("v", i, "_mean")) - !!sym(paste0("v", i, "_upper")),
                                !!sym(paste0("v", i, "_ucl")) := 2 * !!sym(paste0("v", i, "_mean")) - !!sym(paste0("v", i, "_lower")))
  
}

logit_obs <- logit_obs %>% pivot_longer(cols = everything(), names_to = c("variable", "stat"), names_pattern = "(v[0-9]+)_(.*)")
logit_obs <- logit_obs %>% pivot_wider(names_from = stat, values_from = value)
logit_obs <- logit_obs[, -c(1:1)]
logit_obs <- cbind(logit_obs, threshold_df)
logit_obs <- logit_obs %>%mutate(mean_diff = c(0, diff(mean)))
logit_obs <- logit_obs %>% mutate(mean_pct_change = (mean - lag(mean)) / lag(mean)*100)

logit_exp <- logit_exp %>% pivot_longer(cols = everything(), names_to = c("variable", "stat"), names_pattern = "(v[0-9]+)_(.*)")
logit_exp <- logit_exp %>% pivot_wider(names_from = stat, values_from = value)
logit_exp <- logit_exp[, -c(1:1)]
logit_exp <- cbind(logit_exp, threshold_df)
logit_exp <- logit_exp %>%mutate(mean_diff = c(0, diff(mean)))
logit_exp <- logit_exp %>% mutate(mean_pct_change = (mean - lag(mean)) / lag(mean)*100)

# Observational plot
dark_lavender <- "#734F96"
cornell_red <- "#B31B1B"
secondary_line_color <- cornell_red

obs_plot <- ggplot(logit_obs, aes(x = T, y = mean)) + geom_ribbon(aes(ymin = lcl, ymax = ucl, fill = "95% CI"), alpha = 0.6) + geom_line(aes(color = "Pr(Y | T)"), size = 0.8) + 
  geom_line(aes(y = mean_pct_change/20, color = "% Change in probability"), size = 0.8) +  # Scale the mean_diff for visibility
  labs(y = "Pr(Y | T)", x = "Treatment", title = "Observational distribution") + theme_bw() + scale_color_manual(values = c("Pr(Y | T)" = dark_lavender, "% Change in probability"= secondary_line_color)) + 
  scale_fill_manual(values = c("95% CI" = "lavender")) +
  scale_y_continuous(name = "Pr(Y | T)", limits = c(-0.05, 1.3), breaks = seq(0, 1.3, by = 0.2),
                     sec.axis = sec_axis(~ . * 20, name = "% Change in probability", breaks = seq(-0.2, 8, by = 2))  # Define breaks for the secondary axis
  ) +
  scale_x_continuous(name = "Prison time (years)", limits = c(0, 23), breaks = seq(0, 23, by = 5)) +
  theme(
    legend.position = c(0.25, 0.825),
    axis.text.x = element_text(size = 10, angle = 0, vjust = 0.5, hjust = 1),
    axis.text.y = element_text(size = 10),
    axis.title = element_text(size = 10),
    strip.text = element_text(size = 10),
    strip.text.y = element_text(angle = 0),
    plot.title = element_text(size = 10, hjust = 0.5),
    legend.text = element_text(size = 6),
    legend.box.background = element_rect(colour = "black", fill = "white", linewidth = 0.3), 
    legend.spacing.x = unit(0, "mm"), 
    legend.spacing.y = unit(0, "mm"), 
    legend.box.margin = margin(unit(0.1, "cm"), unit(0.1, "cm"), unit(0.1, "cm"), unit(0.1, "cm"))) +
  guides(color = guide_legend(title = NULL), fill = guide_legend(title = NULL))
obs_plot <- obs_plot + guides(color = guide_legend(title = NULL, keyheight = unit(0.8, "lines")))
obs_plot

# Experimental plot
dark_green <- "#006600"
cornell_red <- "#B31B1B"
secondary_line_color <- cornell_red

exp_plot <- ggplot(logit_exp, aes(x = T)) + geom_ribbon(aes(ymin = lcl, ymax = ucl, fill = "95% CI"), alpha = 0.3) + geom_line(aes(y = mean, color = "Pr(Y | do(T))"), size = 0.8) + 
  geom_line(aes(y = mean_pct_change / 20, color = "% Change in probability"), size = 0.8) +  # Scale the mean_diff for visibility
  labs(y = "Pr(Y | do(T))", x = "Treatment", title = "Experimental distribution") + theme_bw() + 
  scale_color_manual(values = c("Pr(Y | do(T))" = dark_green, "% Change in probability" = secondary_line_color)) + scale_fill_manual(values = c("95% CI" = dark_green)) +
  scale_y_continuous(name = "Pr(Y | do(T))", limits = c(-0.05, 1.3), breaks = seq(0, 1.3, by = 0.2),
                     sec.axis = sec_axis(~ . * 20, name = "% Change in probability", breaks = seq(-0.2, 8, by = 2))  # Define breaks for the secondary axis
  ) + scale_x_continuous(name = "Prison time (years)", limits = c(0, 23), breaks = seq(0, 23, by = 5)) +
  theme(
    legend.position = c(0.25, 0.825),
    axis.text.x = element_text(size = 10, angle = 0, vjust = 0.5, hjust = 1),
    axis.text.y = element_text(size = 10),
    axis.title = element_text(size = 10),
    strip.text = element_text(size = 10),
    strip.text.y = element_text(angle = 0),
    plot.title = element_text(size = 10, hjust = 0.5),
    legend.text = element_text(size = 6),
    legend.box.background = element_rect(colour = "black", fill = "white", linewidth = 0.3), 
    legend.spacing.x = unit(0, "mm"), 
    legend.spacing.y = unit(0, "mm"), 
    legend.box.margin = margin(unit(0.1, "cm"), unit(0.1, "cm"), unit(0.1, "cm"), unit(0.1, "cm"))) +
  guides(color = guide_legend(title = NULL), fill = guide_legend(title = NULL))
exp_plot <- exp_plot + guides(color = guide_legend(title = NULL, keyheight = unit(0.8, "lines")))
exp_plot

# Save or display the combined plot
logit <- grid.arrange(exp_plot, obs_plot, ncol = 2, widths = c(4,4))
ggsave(filename="/Users/firatbilgel/Documents/Journal articles/identification/nc_resid_logistic_BFPX_1.png", xgb, width = 8, height = 4, dpi=500)
